{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 1.抽取只在test和train中出现的数据"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1.1导入工具包"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "#保存数据\n",
    "import pickle\n",
    "\n",
    "import itertools\n",
    "# handle date/time strings\n",
    "import datetime\n",
    "\n",
    "import numpy as np\n",
    "import scipy.io as sio\n",
    "#加工稀疏矩阵\n",
    "import scipy.sparse as ss\n",
    "\n",
    "#相似度/距离\n",
    "import scipy.spatial.distance as ssd\n",
    "\n",
    "#KEY不存在时不报ERROR的dict\n",
    "from collections import defaultdict\n",
    "from sklearn.preprocessing import normalize"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1.2记录出现的user和events"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "number of uniqueUsers :3391\n",
      "number of uniqueEvents :13418\n"
     ]
    }
   ],
   "source": [
    "# Only users and events that occur in train.csv or test.csv matter for the\n",
    "# later steps, so collect exactly those ids here.\n",
    "#\n",
    "# train.csv has 6 columns:\n",
    "#   user            - user id\n",
    "#   event           - event id\n",
    "#   invited         - whether the user was invited (0/1)\n",
    "#   timestamp       - ISO-8601 UTC time string: when the user saw the event\n",
    "#   interested, not_interested\n",
    "# test.csv has the same columns except interested / not_interested.\n",
    "\n",
    "# Distinct user ids and event ids, kept as the raw strings read from the CSV.\n",
    "uniqueUsers = set()\n",
    "uniqueEvents = set()\n",
    "\n",
    "dataPath = '../JupyterData/'\n",
    "\n",
    "# Inverted indexes (events per user / users per event); they are only\n",
    "# created here and get filled while train.csv is parsed in section 1.3.\n",
    "eventsForUser = defaultdict(set)\n",
    "usersForEvent = defaultdict(set)\n",
    "\n",
    "for filename in [dataPath+\"train.csv\", dataPath+\"test.csv\"]:\n",
    "    # Binary mode plus an explicit decode() per line; the with-block closes\n",
    "    # the handle even if a row fails to parse.\n",
    "    with open(filename, 'rb') as f:\n",
    "        f.readline()   # skip the header row (column names)\n",
    "        for line in f:\n",
    "            cols = line.strip().decode().split(\",\")\n",
    "            uniqueUsers.add(cols[0])    # column 0: user id\n",
    "            uniqueEvents.add(cols[1])   # column 1: event id\n",
    "\n",
    "n_uniqueUsers = len(uniqueUsers)\n",
    "n_uniqueEvents = len(uniqueEvents)\n",
    "\n",
    "print(\"number of uniqueUsers :%d\" % n_uniqueUsers)\n",
    "print(\"number of uniqueEvents :%d\" % n_uniqueEvents)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### 由于使用set存储id，可知 train.csv和test.csv中，出现的不同的user有3391个，events有13418个"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1.3生成用户对某个活动是否感兴趣的稀疏矩阵(备用)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# User-event score matrix; can later serve as input for LFM / SVD++ models.\n",
    "# Rows are users, columns are events, and the stored value is the\n",
    "# \"interested\" flag read from train.csv.\n",
    "userEventScores = ss.dok_matrix((n_uniqueUsers, n_uniqueEvents))\n",
    "\n",
    "# Map each raw id to a dense 0-based row / column index.  The ids are sorted\n",
    "# first because the iteration order of a set of str is not stable between\n",
    "# interpreter runs (hash randomisation); without sorting, the saved index\n",
    "# tables and the matrix layout would differ from run to run.\n",
    "userIndex = {u: i for i, u in enumerate(sorted(uniqueUsers))}\n",
    "eventIndex = {e: i for i, e in enumerate(sorted(uniqueEvents))}\n",
    "\n",
    "n_records = 0   # NOTE(review): never incremented in this cell - confirm whether it is still needed\n",
    "with open(dataPath+\"train.csv\", 'rb') as ftrain:\n",
    "    ftrain.readline()   # skip the header row\n",
    "    for line in ftrain:\n",
    "        cols = line.strip().decode().split(\",\")\n",
    "        i = userIndex[cols[0]]    # user row\n",
    "        j = eventIndex[cols[1]]   # event column\n",
    "\n",
    "        eventsForUser[i].add(j)   # user i has a train record for event j\n",
    "        usersForEvent[j].add(i)   # event j has a train record from user i\n",
    "\n",
    "        score = int(cols[4])      # column 4: \"interested\" (0/1)\n",
    "        userEventScores[i, j] = score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<3391x13418 sparse matrix of type '<class 'numpy.float64'>'\n",
       "\twith 4131 stored elements in Dictionary Of Keys format>"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "userEventScores"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1.4user和event的关系对，保存文件(备用)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _save_pickle(obj, filename):\n",
    "    \"\"\"Pickle obj to dataPath + filename and close the file handle afterwards.\"\"\"\n",
    "    with open(dataPath + filename, 'wb') as fh:\n",
    "        pickle.dump(obj, fh)\n",
    "\n",
    "\n",
    "# Events recorded per user; used later to let friends' events influence a user.\n",
    "_save_pickle(eventsForUser, \"PE_eventsForUser.pkl\")\n",
    "# Users recorded per event.\n",
    "_save_pickle(usersForEvent, \"PE_usersForEvent.pkl\")\n",
    "\n",
    "# User-event score matrix R, written with scipy.io.mmwrite for later use.\n",
    "sio.mmwrite(dataPath +\"PE_userEventScores\", userEventScores)\n",
    "\n",
    "# id -> index lookup tables for users and events.\n",
    "_save_pickle(userIndex, \"PE_userIndex.pkl\")\n",
    "_save_pickle(eventIndex, \"PE_eventIndex.pkl\")\n",
    "\n",
    "# To avoid needless computation later, collect only the related pairs:\n",
    "#   related users  = user pairs that share at least one event,\n",
    "#   related events = event pairs that share at least one user.\n",
    "# NOTE(review): the \"> 2\" guards skip events with exactly two users (and users\n",
    "# with exactly two events) although those still form one valid pair; the\n",
    "# threshold is kept so the saved pair sets stay the same - confirm whether\n",
    "# \">= 2\" was intended.\n",
    "uniqueUserPairs = set()\n",
    "uniqueEventPairs = set()\n",
    "for event in uniqueEvents:\n",
    "    i = eventIndex[event]\n",
    "    users = usersForEvent[i]\n",
    "    if len(users) > 2:\n",
    "        uniqueUserPairs.update(itertools.combinations(users, 2))\n",
    "\n",
    "for user in uniqueUsers:\n",
    "    u = userIndex[user]\n",
    "    events = eventsForUser[u]\n",
    "    if len(events) > 2:\n",
    "        uniqueEventPairs.update(itertools.combinations(events, 2))\n",
    "\n",
    "# Save the related user pairs and related event pairs.\n",
    "# NOTE(review): the first file name uses the prefix \"FE_\" while every other\n",
    "# file written in this cell uses \"PE_\"; left untouched in case a reader\n",
    "# elsewhere depends on it.\n",
    "_save_pickle(uniqueUserPairs, \"FE_uniqueUserPairs.pkl\")\n",
    "_save_pickle(uniqueEventPairs, \"PE_uniqueEventPairs.pkl\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "57494"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(uniqueEventPairs)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1.5导入原始数据集，用1.2中记录下来的uniqueEvents,过滤数据集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataPath = '../JupyterData/'\n",
    "# Load the full events table (events.csv) into a DataFrame.\n",
    "data = pd.read_csv(dataPath+'events.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 3137972 entries, 0 to 3137971\n",
      "Columns: 110 entries, event_id to c_other\n",
      "dtypes: float64(2), int64(103), object(5)\n",
      "memory usage: 2.6+ GB\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(3137972, 110)"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.info()\n",
    "data.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 原始数据总共3137972行，110维"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>event_id</th>\n",
       "      <th>user_id</th>\n",
       "      <th>start_time</th>\n",
       "      <th>city</th>\n",
       "      <th>state</th>\n",
       "      <th>zip</th>\n",
       "      <th>country</th>\n",
       "      <th>lat</th>\n",
       "      <th>lng</th>\n",
       "      <th>c_1</th>\n",
       "      <th>...</th>\n",
       "      <th>c_92</th>\n",
       "      <th>c_93</th>\n",
       "      <th>c_94</th>\n",
       "      <th>c_95</th>\n",
       "      <th>c_96</th>\n",
       "      <th>c_97</th>\n",
       "      <th>c_98</th>\n",
       "      <th>c_99</th>\n",
       "      <th>c_100</th>\n",
       "      <th>c_other</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>684921758</td>\n",
       "      <td>3647864012</td>\n",
       "      <td>2012-10-31T00:00:00.001Z</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>244999119</td>\n",
       "      <td>3476440521</td>\n",
       "      <td>2012-11-03T00:00:00.001Z</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3928440935</td>\n",
       "      <td>517514445</td>\n",
       "      <td>2012-11-05T00:00:00.001Z</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2582345152</td>\n",
       "      <td>781585781</td>\n",
       "      <td>2012-10-30T00:00:00.001Z</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1051165850</td>\n",
       "      <td>1016098580</td>\n",
       "      <td>2012-09-27T00:00:00.001Z</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 110 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     event_id     user_id                start_time city state  zip country  \\\n",
       "0   684921758  3647864012  2012-10-31T00:00:00.001Z  NaN   NaN  NaN     NaN   \n",
       "1   244999119  3476440521  2012-11-03T00:00:00.001Z  NaN   NaN  NaN     NaN   \n",
       "2  3928440935   517514445  2012-11-05T00:00:00.001Z  NaN   NaN  NaN     NaN   \n",
       "3  2582345152   781585781  2012-10-30T00:00:00.001Z  NaN   NaN  NaN     NaN   \n",
       "4  1051165850  1016098580  2012-09-27T00:00:00.001Z  NaN   NaN  NaN     NaN   \n",
       "\n",
       "   lat  lng  c_1   ...     c_92  c_93  c_94  c_95  c_96  c_97  c_98  c_99  \\\n",
       "0  NaN  NaN    2   ...        0     1     0     0     0     0     0     0   \n",
       "1  NaN  NaN    2   ...        0     0     0     0     0     0     0     0   \n",
       "2  NaN  NaN    0   ...        0     0     0     0     0     0     0     0   \n",
       "3  NaN  NaN    1   ...        0     0     0     0     0     0     0     0   \n",
       "4  NaN  NaN    1   ...        0     0     0     0     0     0     0     0   \n",
       "\n",
       "   c_100  c_other  \n",
       "0      0        9  \n",
       "1      0        7  \n",
       "2      0       12  \n",
       "3      0        8  \n",
       "4      0        9  \n",
       "\n",
       "[5 rows x 110 columns]"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 根据event_id筛选，1.2步骤中已将需要提取的event_id存在uniqueEvents中"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "# uniqueEvents holds the ids as str (they were split out of CSV text) while\n",
    "# data['event_id'] was parsed by pandas as an integer column.  Convert the ids\n",
    "# before matching: recent pandas versions do not treat '123' and 123 as equal\n",
    "# in isin(), which would silently produce an empty frame.\n",
    "event_ids = {int(e) for e in uniqueEvents}\n",
    "data = data[data['event_id'].isin(event_ids)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "13418"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(data['event_id'].unique())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 可见新data中，event_id的不同取值共有13418种，与1.2中计算出的event数目一致，筛选成功。保存为select_by_events.csv备用"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the filtered events. The DataFrame index is written as an extra leading column\n",
    "# (it shows up as 'Unnamed: 0' when the file is read back in section 2.1).\n",
    "data.to_csv(dataPath+'select_by_events.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 2.数据预处理"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2.1 读取数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(13418, 111)"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataPath = '../JupyterData/'\n",
    "# Reload the filtered events that were written to select_by_events.csv in section 1.5.\n",
    "data=pd.read_csv(dataPath+'select_by_events.csv')\n",
    "data.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>event_id</th>\n",
       "      <th>user_id</th>\n",
       "      <th>start_time</th>\n",
       "      <th>city</th>\n",
       "      <th>state</th>\n",
       "      <th>zip</th>\n",
       "      <th>country</th>\n",
       "      <th>lat</th>\n",
       "      <th>lng</th>\n",
       "      <th>...</th>\n",
       "      <th>c_92</th>\n",
       "      <th>c_93</th>\n",
       "      <th>c_94</th>\n",
       "      <th>c_95</th>\n",
       "      <th>c_96</th>\n",
       "      <th>c_97</th>\n",
       "      <th>c_98</th>\n",
       "      <th>c_99</th>\n",
       "      <th>c_100</th>\n",
       "      <th>c_other</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>684921758</td>\n",
       "      <td>3647864012</td>\n",
       "      <td>2012-10-31T00:00:00.001Z</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>244999119</td>\n",
       "      <td>3476440521</td>\n",
       "      <td>2012-11-03T00:00:00.001Z</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>3928440935</td>\n",
       "      <td>517514445</td>\n",
       "      <td>2012-11-05T00:00:00.001Z</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>2582345152</td>\n",
       "      <td>781585781</td>\n",
       "      <td>2012-10-30T00:00:00.001Z</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>1051165850</td>\n",
       "      <td>1016098580</td>\n",
       "      <td>2012-09-27T00:00:00.001Z</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 111 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   Unnamed: 0    event_id     user_id                start_time city state  \\\n",
       "0           0   684921758  3647864012  2012-10-31T00:00:00.001Z  NaN   NaN   \n",
       "1           1   244999119  3476440521  2012-11-03T00:00:00.001Z  NaN   NaN   \n",
       "2           2  3928440935   517514445  2012-11-05T00:00:00.001Z  NaN   NaN   \n",
       "3           3  2582345152   781585781  2012-10-30T00:00:00.001Z  NaN   NaN   \n",
       "4           4  1051165850  1016098580  2012-09-27T00:00:00.001Z  NaN   NaN   \n",
       "\n",
       "   zip country  lat  lng   ...     c_92  c_93  c_94  c_95  c_96  c_97  c_98  \\\n",
       "0  NaN     NaN  NaN  NaN   ...        0     1     0     0     0     0     0   \n",
       "1  NaN     NaN  NaN  NaN   ...        0     0     0     0     0     0     0   \n",
       "2  NaN     NaN  NaN  NaN   ...        0     0     0     0     0     0     0   \n",
       "3  NaN     NaN  NaN  NaN   ...        0     0     0     0     0     0     0   \n",
       "4  NaN     NaN  NaN  NaN   ...        0     0     0     0     0     0     0   \n",
       "\n",
       "   c_99  c_100  c_other  \n",
       "0     0      0        9  \n",
       "1     0      0        7  \n",
       "2     0      0       12  \n",
       "3     0      0        8  \n",
       "4     0      0        9  \n",
       "\n",
       "[5 rows x 111 columns]"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 去掉作业要求聚类外的前1+9列特征"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(13418, 101)"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Keep only the clustering features (c_1 ... c_100, c_other): drop the saved\n",
    "# index column ('Unnamed: 0') plus the nine id / time / location columns.\n",
    "# Dropping by name with errors='ignore' keeps the cell safe to re-run; a\n",
    "# positional drop of column 0 would remove c_1 on a second run.\n",
    "non_feature_cols = ['Unnamed: 0', 'event_id', 'user_id', 'start_time', 'city',\n",
    "                    'state', 'zip', 'country', 'lat', 'lng']\n",
    "data = data.drop(non_feature_cols, axis=1, errors='ignore')\n",
    "data.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>c_1</th>\n",
       "      <th>c_2</th>\n",
       "      <th>c_3</th>\n",
       "      <th>c_4</th>\n",
       "      <th>c_5</th>\n",
       "      <th>c_6</th>\n",
       "      <th>c_7</th>\n",
       "      <th>c_8</th>\n",
       "      <th>c_9</th>\n",
       "      <th>c_10</th>\n",
       "      <th>...</th>\n",
       "      <th>c_92</th>\n",
       "      <th>c_93</th>\n",
       "      <th>c_94</th>\n",
       "      <th>c_95</th>\n",
       "      <th>c_96</th>\n",
       "      <th>c_97</th>\n",
       "      <th>c_98</th>\n",
       "      <th>c_99</th>\n",
       "      <th>c_100</th>\n",
       "      <th>c_other</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 101 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   c_1  c_2  c_3  c_4  c_5  c_6  c_7  c_8  c_9  c_10   ...     c_92  c_93  \\\n",
       "0    2    0    2    0    0    0    0    0    0     0   ...        0     1   \n",
       "1    2    0    2    0    0    0    0    0    0     0   ...        0     0   \n",
       "2    0    0    0    0    0    0    0    0    0     0   ...        0     0   \n",
       "3    1    0    2    1    0    0    0    0    0     0   ...        0     0   \n",
       "4    1    1    0    0    0    0    0    2    0     0   ...        0     0   \n",
       "\n",
       "   c_94  c_95  c_96  c_97  c_98  c_99  c_100  c_other  \n",
       "0     0     0     0     0     0     0      0        9  \n",
       "1     0     0     0     0     0     0      0        7  \n",
       "2     0     0     0     0     0     0      0       12  \n",
       "3     0     0     0     0     0     0      0        8  \n",
       "4     0     0     0     0     0     0      0        9  \n",
       "\n",
       "[5 rows x 101 columns]"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2.2 PCA降维"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(13418, 20)\n"
     ]
    }
   ],
   "source": [
    "from sklearn.decomposition import PCA\n",
    "# Reduce the features to 20 principal components.\n",
    "# random_state is fixed so the projection is reproducible in case PCA picks\n",
    "# a randomized SVD solver (possible with the default svd_solver='auto').\n",
    "pca = PCA(n_components=20, random_state=0)\n",
    "pca.fit(data)\n",
    "\n",
    "data_pca = pca.transform(data)\n",
    "\n",
    "# Shape after the reduction: (n_samples, 20)\n",
    "print(data_pca.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 3聚类"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3.1定义评价函数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.cluster import MiniBatchKMeans\n",
    "import time\n",
    "from sklearn import metrics\n",
    "\n",
    "\n",
    "def K_cluster_analysis(K, x_data, random_state=0):\n",
    "    \"\"\"Cluster x_data into K groups with MiniBatchKMeans and score the result.\n",
    "\n",
    "    Only unlabeled training data is available here, so the clustering is\n",
    "    judged with the silhouette coefficient (larger is better).\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    K : int\n",
    "        Number of clusters.\n",
    "    x_data : array-like of shape (n_samples, n_features)\n",
    "        Samples to cluster.\n",
    "    random_state : int or None, default 0\n",
    "        Seed passed to MiniBatchKMeans so repeated calls with the same K give\n",
    "        the same score; pass None for a fresh random initialisation.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    float\n",
    "        Silhouette coefficient of the predicted labels.  The local name and\n",
    "        the printed label say \"CH_score\" to stay compatible with the rest of\n",
    "        the notebook, but the value is NOT a Calinski-Harabasz index.\n",
    "    \"\"\"\n",
    "    start = time.time()\n",
    "\n",
    "    print(\"K-means begin with clusters: {}\".format(K))\n",
    "\n",
    "    # fit_predict trains the model and returns the labels in one pass; a\n",
    "    # separate fit() call beforehand would only train the model twice.\n",
    "    mb_kmeans = MiniBatchKMeans(n_clusters=K, random_state=random_state)\n",
    "    y_pred = mb_kmeans.fit_predict(x_data)\n",
    "\n",
    "    # Silhouette coefficient of the clustering on the training data.\n",
    "    CH_score = metrics.silhouette_score(x_data, y_pred)\n",
    "\n",
    "    end = time.time()\n",
    "    print(\"CH_score: {}, time elaps:{}\".format(CH_score, int(end-start)))\n",
    "\n",
    "    return CH_score"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3.2 粗调参数K"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "K-means begin with clusters: 10\n",
      "CH_score: 0.4136012581712272, time elaps:6\n",
      "K-means begin with clusters: 20\n",
      "CH_score: 0.2529721695464385, time elaps:6\n",
      "K-means begin with clusters: 30\n",
      "CH_score: 0.25407718686383135, time elaps:6\n",
      "K-means begin with clusters: 40\n",
      "CH_score: 0.19665095788614156, time elaps:6\n",
      "K-means begin with clusters: 50\n",
      "CH_score: 0.21476564099453335, time elaps:6\n",
      "K-means begin with clusters: 60\n",
      "CH_score: 0.17590009971750062, time elaps:6\n"
     ]
    }
   ],
   "source": [
    "# Coarse search grid for the hyper-parameter K (the number of clusters).\n",
    "Ks = [10, 20, 30, 40, 50, 60]\n",
    "\n",
    "# One score per candidate K, kept in the same order as Ks.\n",
    "CH_scores = [K_cluster_analysis(K, data_pca) for K in Ks]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 可以看出K=10时的CH_score（这里实际计算的是轮廓系数）最高，不过仍然偏低，仅约0.41；最优值落在搜索范围的下边界，因此在更小的K范围内进一步搜索"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3.3 进一步搜索更优的K"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "K-means begin with clusters: 2\n",
      "CH_score: 0.7968293349015135, time elaps:6\n",
      "K-means begin with clusters: 4\n",
      "CH_score: 0.5864464756008543, time elaps:6\n",
      "K-means begin with clusters: 6\n",
      "CH_score: 0.5018296559977805, time elaps:7\n",
      "K-means begin with clusters: 8\n",
      "CH_score: 0.4695560788219376, time elaps:7\n",
      "K-means begin with clusters: 10\n",
      "CH_score: 0.4647907125739385, time elaps:7\n"
     ]
    }
   ],
   "source": [
    "# Finer search grid for K (the number of clusters), below the coarse optimum.\n",
    "Ks = [2, 4, 6, 8, 10]\n",
    "\n",
    "# One score per candidate K, kept in the same order as Ks.\n",
    "CH_scores = [K_cluster_analysis(K, data_pca) for K in Ks]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### K=2时，CH_score（轮廓系数）达到约0.797，较为理想"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3.4绘制结果图形"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[<matplotlib.lines.Line2D at 0x1e2ca5c0>]"
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX0AAAD8CAYAAACb4nSYAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAIABJREFUeJzt3XeYVGWa/vHvQxPNIG0iCOOAA4iilowRYUzoOKDj6IKr4s9BTAQdExgGxTwmHERcZHSdoOgPdxFXV0ygjpEGQQVEEXVoUWlBxUx69o/3IEXT0NVQ3W+F+3NddXXXqVPUDZfeJ9Q572vujoiIFId6sQOIiEjdUemLiBQRlb6ISBFR6YuIFBGVvohIEVHpi4gUEZW+iEgRUemLiBQRlb6ISBGpHztAZc2bN/c2bdrEjiEiklemT5/+ubuXVrdezpV+mzZtKCsrix1DRCSvmNlHmayn0zsiIkVEpS8iUkRU+iIiRSSj0jeznmY2z8zmm9nQKl5vbWZTzOwNM3vTzI5Je21Y8r55ZnZUNsOLiEjNVPtFrpmVAKOBI4ByYJqZTXL3OWmrXQE87O5jzKwj8ATQJvm9D9AJ2AV4xszau/uqbP9FRESkepns6XcF5rv7AndfDowHeldax4Ftkt+3BRYlv/cGxrv7j+7+ATA/+fNERCSCTEq/BbAw7Xl5sizdVcApZlZO2MsfVIP3YmYDzKzMzMoqKioyjC4iIjWVSelbFcsqz7HYF/hPd28JHAP8zczqZfhe3H2su6fcPVVaWu29BVX65hsYNgw++GCT3i4iUhQyKf1yoFXa85asPX2zxu+BhwHc/RWgMdA8w/dmxZdfwqhRMHhwbfzpIiKFIZPSnwa0M7O2ZtaQ8MXspErr/As4DMDMOhBKvyJZr4+ZNTKztkA74PVshU/XsiVcfTX8z//ApMrpREQEyKD03X0lMBCYDMwlXKUz28xGmFmvZLULgTPNbBbwIHC6B7MJRwBzgCeB82rzyp3Bg6FTp/Dzu+9q61NERPKXua93ij2qVCrlmzP2zosvQrducNllcN11WQwmIpLDzGy6u6eqW6/g7sg95BA47TS4+WZ4553YaUREckvBlT7An/4EW24JAwdCjh3IiIhEVZClv+OO4dTOs8/CQw/FTiMikjsKsvQBzjoL9t0X/vAHWLYsdhoRkdxQsKVfUgJjxsCnn8Lw4bHTiIjkhoItfYD99gt7/KNGwaxZsdOIiMRX0KUPcP310KwZnHsurF4dO42ISFwFX/pNm4areV5+Ge6/P3YaEZG4Cr70IVy3f/DBcPHFsGRJ7DQiIvEURenXqwd33RUGZbvssthpRETiKYrSB+jcGYYMgXvugddei51GRCSOoil9gKuugp13Dl/qrtKEjSJShIqq9LfeGm6/HWbMgLvvjp1GRKTuFVXpA5x4Ihx+OFx+OXz2Wew0IiJ1q+hK3wxGj4bvvw9X84iIFJOiK32A9u1D4f/tb/D887HTiIjUnaIsfQiXbrZpE77UXbEidhoRkbpRtKW/xRbw5z/DnDkwcmTsNCIidaNoSx/gN7+BXr3ChOoLF8ZOIyJS+4q69AHuuCMMxHbBBbGTiIjUvoxK38x6mtk8M5tvZkOreP12M5uZPN41sy/TXluV9tqkbIbPhjZt4Ior4JFH4MknY6cREald5tVMImtmJcC7wBFAOTAN6Ovuczaw/iBgb3c/I3n+jbtvlWmgVCrlZWVlma6eFT/+CHvuGe7SffttaNy4Tj9eRGSzmdl0d09Vt14me/pdgfnuvsDdlwPjgd4bWb8v8GBmMXNDo0bh2v333w/DMIuIFKpMSr8FkP41Z3mybD1mtivQFngubXFjMyszs1fN7LhNTlrLDj8c+vQJk668/37sNCIitSOT0rcqlm3onFAfYIK7pw9n1jo55DgZGGlmu633AWYDkg1DWUVFRQaRasett0LDhjBoEFRz1ktE
JC9lUvrlQKu05y2BRRtYtw+VTu24+6Lk5wJgKrB35Te5+1h3T7l7qrS0NINItWOXXWDECPjf/4WJE6PFEBGpNZmU/jSgnZm1NbOGhGJf7yocM9sdaAq8krasqZk1Sn5vDhwEVPkFcK4YODB8qTtkCHz7bew0IiLZVW3pu/tKYCAwGZgLPOzus81shJn1Slu1LzDe170cqANQZmazgCnAjRu66idX1K8fZtlauBCuuSZ2GhGR7Kr2ks26FuOSzaqccUYYkG3WLOjYMXYaEZGNy+Ylm0XpppvCpCvnnacvdUWkcKj0N6C0FG64AaZOhQceiJ1GRCQ7VPob0b8/dO0KF14IX30VO42IyOZT6W9ESUn4UnfxYrjyythpREQ2n0q/GvvuGyZaGT0a3ngjdhoRkc2j0s/AtddC8+ZwzjlhGGYRkXyl0s/AdtvBLbfAa6/BvffGTiMisulU+hk65RTo1g0uvRQ+/zx2GhGRTaPSz5BZOK+/bBkMGxY7jYjIplHp18Aee8D558O4cfDKK9WvLyKSa1T6NTR8OLRoEb7UXbkydhoRkZpR6dfQVluFydRnzQrX8IuI5BOV/ib47W/hqKPChOqffBI7jYhI5lT6m8AM7rwTli+Hiy6KnUZEJHMq/U3085+HyzcfeACee6769UVEcoFKfzMMHQo/+1kYfnn58thpRESqp9LfDE2awKhR8M47cNttsdOIiFRPpb+ZjjkGjj8+TK340Uex04iIbJxKPwtGjgw/zz8/bg4Rkeqo9LOgdWv44x9h4kR4/PHYaURENkylnyUXXAAdOsCgQfD997HTiIhULaPSN7OeZjbPzOab2dAqXr/dzGYmj3fN7Mu01/qZ2XvJo182w+eShg3DgGwffAA33hg7jYhI1czdN76CWQnwLnAEUA5MA/q6+5wNrD8I2NvdzzCzZkAZkAIcmA7s6+5fbOjzUqmUl5WVbcrfJSf8+7/DhAnw9tvQrl3sNCJSLMxsurunqlsvkz39rsB8d1/g7suB8UDvjazfF3gw+f0o4Gl3X5oU/dNAzww+M2/dcgs0bhxO81SzPRURqXOZlH4LYGHa8/Jk2XrMbFegLbDmHtWM31sodt45XL45eTI88kjsNCIi68qk9K2KZRvah+0DTHD3VTV5r5kNMLMyMyurqKjIIFJuO/dc6NIlXML59dex04iIrJVJ6ZcDrdKetwQWbWDdPqw9tZPxe919rLun3D1VWlqaQaTcVr9+GHb5449hxIjYaURE1sqk9KcB7cysrZk1JBT7pMormdnuQFMgfU6pycCRZtbUzJoCRybLCt4BB0D//nD77eFLXRGRXFBt6bv7SmAgoaznAg+7+2wzG2FmvdJW7QuM97TLgdx9KXANYcMxDRiRLCsKN9wA224bTvfoS10RyQXVXrJZ1/L9ks3Kxo2DM8+E+++H006LnUZEClU2L9mUzXDGGbD//nDxxfDFBu9OEBGpGyr9WlavHowZA59/HqZXFBGJSaVfB7p0gYEDQ/lPnx47jYgUM5V+HRkxAnbcEc45B1atqn59EZHaoNKvI9tuC7feCtOmhS93RURiUOnXob59oUcPGDYMCuDGYxHJQyr9OmQWhl/++mu49NLYaUSkGKn061iHDnDhhXDfffDSS7HTiEixUelHcOWV0KpV+FJ35crYaUSkmKj0I9hyS7jjDnjrLRg1KnYaESkmKv1IjjsOjjkmTKj+8cex04hIsVDpR2IGf/4zrFgRzvGLiNQFlX5Eu+0Gl10GDz0EzzwTO42IFAOVfmSXXBLK/7zz4McfY6cRkUKn0o+scWO48054990wqbqISG1S6eeAnj3hhBPg2mvhgw9ipxGRQqbSzxG33w4lJTBkSOwkIlLIVPo5olUruOoqeOwxmLTeDMQiItmh0s8hQ4ZAp04weDB8913sNCJSiFT6OaRBA7jrLvjoI7j++thpRKQQqfRzTLducOqp8Kc/wbx5sdOISKHJqPTNrKeZzTOz+WY2dAPr
nGRmc8xstpk9kLZ8lZnNTB46W52Bm2+GLbYIUyy6x04jIoWk2tI3sxJgNHA00BHoa2YdK63TDhgGHOTunYDz017+3t27JI9e2YteuHbcEa67Ltyl+/DDsdOISCHJZE+/KzDf3Re4+3JgPNC70jpnAqPd/QsAd1+c3ZjF5+yzYZ994IILYNmy2GlEpFBkUvotgIVpz8uTZenaA+3N7CUze9XMeqa91tjMypLlx21m3qJRUgJjxsCnn4ZLOUVEsiGT0rcqllU+01wfaAd0B/oC48xsu+S11u6eAk4GRprZbut9gNmAZMNQVqHJY3/StSsMGBBG43zzzdhpRKQQZFL65UCrtOctgUVVrPOou69w9w+AeYSNAO6+KPm5AJgK7F35A9x9rLun3D1VWlpa479EIbv+emjaNMyytXp17DQiku8yKf1pQDsza2tmDYE+QOWrcCYCPQDMrDnhdM8CM2tqZo3Slh8EzMlW+GLQrFm4fPPll+H++2OnEZF8V23pu/tKYCAwGZgLPOzus81shJmtuRpnMrDEzOYAU4CL3X0J0AEoM7NZyfIb3V2lX0P9+sGBB4ZhmJcujZ1GRPKZeY5dCJ5KpbysrCx2jJzz5pvhap7+/eHuu2OnEZFcY2bTk+9PN0p35OaJPfeEQYNg7Fh4/fXYaUQkX6n088jVV8NOO4UvdVetip1GRPKRSj+PbLMN3HYbzJgB//EfsdOISD5S6eeZf/s3OOywMKH6Z5/FTiMi+Ualn2fMYPToMN7+JZfETiMi+Ualn4d23x0uvhj++ld44YXYaUQkn6j089Tll8Ouu8K558KKFbHTiEi+UOnnqS22CGPyzJ4Nd9wRO42I5AuVfh7r1Qt+85swCmd5eew0IpIPVPp57o47wjX7F1wQO4mI5AOVfp5r2xauuAImTIDJk2OnEZFcp9IvABddBO3bhzl1f/ghdhoRyWUq/QLQqBHceSfMnx+GYRYR2RCVfoE44gg46aQw6cr778dOIyK5SqVfQG67DRo0gMGDIcdGzBaRHKHSLyAtWoSROJ94Ah59NHYaEclFKv0CM2gQdO4c9va//TZ2GhHJNSr9AtOgAdx1FyxcCNdeGzuNiOQalX4BOvhgOP10uOUWmDs3dhoRySUq/QJ1002w1VZw3nn6UldE1lLpF6gddoAbboApU+DBB2OnEZFckVHpm1lPM5tnZvPNbOgG1jnJzOaY2WwzeyBteT8zey959MtWcKnemWfCfvvBhRfCV1/FTiMiuaDa0jezEmA0cDTQEehrZh0rrdMOGAYc5O6dgPOT5c2A4cAvga7AcDNrmtW/gWxQSUn4Uvezz+CPf4ydRkRyQSZ7+l2B+e6+wN2XA+OB3pXWORMY7e5fALj74mT5UcDT7r40ee1poGd2oksmUik455wwTMPMmbHTiEhsmZR+C2Bh2vPyZFm69kB7M3vJzF41s541eC9mNsDMysysrKKiIvP0kpFrr4Xttw/lv3p17DQiElMmpW9VLKt8PUh9oB3QHegLjDOz7TJ8L+4+1t1T7p4qLS3NIJLURNOm4fLNV1+F++6LnUZEYsqk9MuBVmnPWwKLqljnUXdf4e4fAPMIG4FM3it14NRT4ZBD4JJL4PPPY6cRkVgyKf1pQDsza2tmDYE+wKRK60wEegCYWXPC6Z4FwGTgSDNrmnyBe2SyTOqYWfhS96uvYNiw2GlEJJZqS9/dVwIDCWU9F3jY3Web2Qgz65WsNhlYYmZzgCnAxe6+xN2XAtcQNhzTgBHJMolgjz3g/PNh3LhwqkdEio95jt2umUqlvKysLHaMgvX119ChA5SWwrRpUL9+7EQikg1mNt3dU9Wtpztyi8zWW8Ptt4fLN8eMiZ1GROqaSr8I/e53cOSRYUL1Tz6JnUZE6pJKvwiZhZu1fvwRDjgAnn02diIRqSsq/SLVrh089xw0bAiHHw5nnw3LlsVOJSK1TaVfxA48EGbNgosugnvuCVf3PPVU7FQiUptU+kWuSRO4
+WZ46SXYcks46ijo31+jcooUKpW+ALD//vDGG3DppWGohk6dwgTrIlJYVPryk8aN4cYb4ZVXYNtt4de/DtMufvFF7GQiki0qfVlP164wYwZcfjn8/e9hr/+xx2KnEpFsUOlLlRo1CkMyv/YaNG8OvXqFQduWahANkbym0peN2ndfKCsLM2+NHw8dO8LEibFTicimUulLtRo2hKuvDmP17LQTHH889O2rIZpF8pFKXzLWpUso/quvhkceCXv9EybETiUiNaHSlxpp0CCc6pk+HVq1ghNPhJNOgsWLq3+viMSn0pdN0rlzGJP/uuvg0UfDFT4PPQQ5NlK3iFSi0pdN1qABXHZZuLyzbVvo0wdOOAE+/TR2MhHZEJW+bLZOneDll+Gmm8JdvJ06wT/+ob1+kVyk0pesqF8/TLo+cya0bw+nnALHHafx+kVyjUpfsuoXv4B//hNuvTWM2NmxI/z1r9rrF8kVKn3JupIS+MMfwrDNnTpBv35w7LHw8cexk4mISl9qTfv28PzzMHIkTJkSNgD33qu9fpGYMip9M+tpZvPMbL6ZDa3i9dPNrMLMZiaP/mmvrUpbPimb4SX3lZTAkCHw5puw117w+9/D0UfDwoWxk4kUp2pL38xKgNHA0UBHoK+Zdaxi1YfcvUvyGJe2/Pu05b2yE1vyzc9/Hvb2R40K5/w7dQqzdWmvX6RuZbKn3xWY7+4L3H05MB7oXbuxpBDVqwcDB8Jbb0EqBQMGwJFHwocfxk4mUjwyKf0WQPrBeHmyrLITzOxNM5tgZq3Sljc2szIze9XMjqvqA8xsQLJOWUVFRebpJS+1bQvPPANjxoS7ejt3Dr+vXh07mUjhy6T0rYpllQ/KHwPauPuewDPA/WmvtXb3FHAyMNLMdlvvD3Mf6+4pd0+VlpZmGF3yWb16cPbZ8PbbYarGc8+Fww+HBQtiJxMpbJmUfjmQvufeEliUvoK7L3H3H5On9wD7pr22KPm5AJgK7L0ZeaXA7LpruJ7/nnvCuP2dO4fz/trrF6kdmZT+NKCdmbU1s4ZAH2Cdq3DMbOe0p72AucnypmbWKPm9OXAQMCcbwaVwmEH//jB7NnTrBoMHQ48eMH9+7GQihafa0nf3lcBAYDKhzB9299lmNsLM1lyNM9jMZpvZLGAwcHqyvANQliyfAtzo7ip9qVKrVmHsnvvuCzd27blnuMZ/1arYyUQKh3mOXTOXSqW8rKwsdgyJ7OOP4ayz4PHH4cADw01du+8eO5VI7jKz6cn3pxulO3IlJ7VoAY89FsbtmTs3zNp1663a6xfZXCp9yVlmcOqp4Vz/kUfCRRfBwQeHjYCIbBqVvuS8nXeGiRPDGP3vvgt77x3G7l+5MnYykfyj0pe8YAYnnwxz5sCvfw1Dh4Zz/bNnx04mkl9U+pJXdtwRJkwI8/F+8AHss0+Yp3fFitjJRPKDSl/yjhmcdFLY6z/uOLjiinBX75tvxk4mkvtU+pK3SkvDHv+ECVBeHgZxGzFCe/0iG6PSl7x3wgnh3P6JJ8Lw4bDffmGuXhFZn0pfCkLz5uHqnokT4bPPQvEPHw7Ll8dOJpJbVPpSUHr3Dnv9ffuGUz2pFEyfHjuVSO5Q6UvBadYs3Mn72GOwZAn88pdw+eXw44/Vv1ek0Kn0pWAde2zY6z/tNLj++nB557RpsVOJxKXSl4K23XZhsLYnnoBly8KlnUOHwg8/xE4mEodKX4rC0UeHWbrOOCMM4bD33vDKK7FTidQ9lb4UjW23DTN0PfUUfPcdHHRQGMTt++9jJxOpOyp9KTpHHBH2+s86KwzX3KULvPRS7FQidUOlL0Vp661hzBh49tlwLf8hh8D558O338ZOJlK7VPpS1H71K3jrLTjvPLjjDthrL3jhhdipRGqPSl+K3lZbwahRMHUquMOhh8KgQfDNN7GTiWSfSl8kceihYaTOIUNg9OgwMftzz8VOJZJdGZW+mfU0s3lmNt/Mhlbx+ulmVmFmM5NH/7TX+pnZ
e8mjXzbDi2TbllvCyJHhFE/9+nDYYXDuufD117GTiWRHtaVvZiXAaOBooCPQ18w6VrHqQ+7eJXmMS97bDBgO/BLoCgw3s6ZZSy9SSw4+OIzUeeGFcPfd0LkzPPNM7FQimy+TPf2uwHx3X+Duy4HxQO8M//yjgKfdfam7fwE8DfTctKgidWuLLeCWW8LlnI0bh0s9u3cPA7m9+KLG8pH8lEnptwAWpj0vT5ZVdoKZvWlmE8ysVQ3fK5KzDjgA3ngjlP2yZXDVVdCtGzRtCocfHqZrfOklDeMs+SGT0rcqlnml548Bbdx9T+AZ4P4avBczG2BmZWZWVlFRkUEkkbrVpAlceSXMmBFG7pw4EQYMgM8/D9M1Hnxw2AgcdRTccAO8+qpm8JLcVD+DdcqBVmnPWwKL0ldw9yVpT+8Bbkp7b/dK751a+QPcfSwwFiCVSq23URDJJU2bhnH7eycnOZcsgeefD5d8TpkCl10Wlm+1VdgY9OgRTgvts0/4clgkJnPfeMeaWX3gXeAw4GNgGnCyu89OW2dnd/8k+f144FJ33z/5Inc6sE+y6gxgX3dfuqHPS6VSXlZWthl/JZG4KirCRmDKlLAhmDMnLN9mm3Dnb/fuYUPQpQuUlMRMKoXEzKa7e6q69ard73D3lWY2EJgMlAD3uvtsMxsBlLn7JGCwmfUCVgJLgdOT9y41s2sIGwqAERsrfJFCUFoKv/tdeECYvnHq1LVHAo8/HpZvu234bmDNkcBee0E93TkjtazaPf26pj19KXSLFq17JPDee2F506bhBrE1RwJ77KGNgGQu0z19lb5IZOXl6x4JLFgQlm+/fdgI9OgRHh07glV1aYQIKn2RvPWvf63dAEyZAh99FJaXloajgDVHAr/4hTYCspZKX6RAfPjh2lNBU6bAwuTOlx13XLsB6NED2rXTRqCYqfRFCpB7OP2TfiSwKLmAepdd1j0S2G03bQSKiUpfpAi4w/z56x4JfPppeK1ly3WPBNq00UagkKn0RYqQO8ybt3YDMHUqLF4cXmvdeu0GoHt32HXXiEEl61T6IoI7zJ27dgMwdWoYOgKgbdt1jwRatowYVDabSl9E1rN6NcyevfZI4PnnYWlyu+Ruu617JLDLLjGTSk2p9EWkWqtXhzmC13wp/MIL8OWX4bX27dceCXTvDjvtFDOpVEelLyI1tmoVzJq19kjghRfCcNIQ7gtYcyRw6KGwww5Ro0olKn0R2WwrV4YZxNYcCbz44toJ4zt1WnsUcOih0Lx51KhFT6UvIlm3ciVMn772SOCf/4Rvvw2vde689kigWzdo1ixq1KKj0heRWrdiBZSVrT0SeOkl+P77cD/AXnuFU0JNmoSpJzf0c2OvrfnZoEHsv2nuU+mLSJ1bvhxef33tJaLl5fDdd2FDsObnpigpyXwDkck6G1q3SZP8nehGpS8iOccdfvhh/Q1B+s+NvVaTdTd14voGDbK3EdnYOk2aZHfo7KxNoiIiki1mawuvtq1aVf0GpiYbmu++C1NjLly4/mubOh9yo0brbhBSKXjwwez+O1Sm0heRglRSAltuGR61beXKsAGo6Uak8rI2bWo/q0pfRGQz1a8PW28dHrlOk7GJiBQRlb6ISBFR6YuIFJGMSt/MeprZPDObb2ZDN7Le78zMzSyVPG9jZt+b2czkcXe2gouISM1V+0WumZUAo4EjgHJgmplNcvc5ldbbGhgMvFbpj3jf3btkKa+IiGyGTPb0uwLz3X2Buy8HxgO9q1jvGuBPwA9ZzCciIlmUSem3ABamPS9Plv3EzPYGWrn7/1Tx/rZm9oaZPW9mh2x6VBER2VyZXKdf1VTKP43dYGb1gNuB06tY7xOgtbsvMbN9gYlm1sndl63zAWYDgAEArVu3zjC6iIjUVCalXw60SnveEliU9nxrYA9gqpkB7ARMMrNe7l4G/Ajg7tPN7H2gPbDO4DruPhYYC2BmFWb20ab9dQBoDny+Ge+vLcpVM8pVM8pVM4WY
K6Op7qsdcM3M6gPvAocBHwPTgJPdffYG1p8KXOTuZWZWCix191Vm9jPgRaCzuy/N+K9RQ2ZWlsmgQ3VNuWpGuWpGuWqmmHNVu6fv7ivNbCAwGSgB7nX32WY2Aihz90kbeXs3YISZrQRWAWfXZuGLiMjGZTT2jrs/ATxRadkfN7Bu97TfHwEe2Yx8IiKSRYV4R+7Y2AE2QLlqRrlqRrlqpmhz5dwkKiIiUnsKcU9fREQ2oCBK38xamdkUM5trZrPNbEjsTABm1tjMXjezWUmuq2NnSmdmJcmNc1XdVBeFmX1oZm8lYzXlzLyZZradmU0ws3eS/84OiJ0JwMx2TxvbaqaZLTOz83Mg1wXJf/Nvm9mDZtY4diYAMxuSZJod+9/JzO41s8Vm9nbasmZm9rSZvZf8bJrtzy2I0gdWAhe6ewdgf+A8M+sYOROEexR+5e57AV2Anma2f+RM6YYAc2OHqEIPd++SY5fU3QE86e6/APYiR/7d3H1e8m/VBdgX+A7475iZzKwFYRyulLvvQbjqr0/MTABmtgdwJmFomb2AY82sXcRI/wn0rLRsKPCsu7cDnk2eZ1VBlL67f+LuM5Lfvyb8D9li4++qfR58kzxtkDxy4ksUM2sJ/BoYFztLrjOzbQiXH/8FwN2Xu/uXcVNV6TDCAIebc3NjttQHmiT3+WzBujd0xtIBeNXdv3P3lcDzwPGxwrj7C0DlS9h7A/cnv98PHJftzy2I0k9nZm2AvVl/tM8oklMoM4HFwNPunhO5gJHAJcDq2EEqceApM5ueDM+RC34GVAD3JafDxplZHcy8WmN9gFqeVrt67v4xcAvwL8JQLF+5+1NxUwHwNtDNzLY3sy2AY1h3tIFcsKO7fwJhZxbYIdsfUFClb2ZbEe4LOL/y+D6xuPuq5NC7JdA1OcSMysyOBRa7+/TYWapwkLvvAxxNOE3XLXYgwl7rPsAYd98b+JZaOOzeHGbWEOgF/P8cyNKUsMfaFtgF2NLMTombCtx9LnAT8DTwJDCLcGq4qBRM6ZtZA0Lh/8Pd/yt2nsqS0wFTWf8cXgwHAb3M7EPCUNm/MrO/x40UuPui5OdiwrnprnETAWH8qfK0o7QJhI1ALjkamOHun8UOAhwOfODuFe6+AvgJwPR3AAABVElEQVQv4MDImQBw97+4+z7u3o1wauW92Jkq+czMdgZIfi7O9gcUROlbGOntL8Bcd78tdp41zKzUzLZLfm9C+J/hnbipwN2HuXtLd29DOCXwnLtH3xMzsy2TyXhITp8cSTgkj8rdPwUWmtnuyaLDgDkbeUsMfcmBUzuJfwH7m9kWyf+bh5EjX3yb2Q7Jz9bAb8mdf7M1JgH9kt/7AY9m+wMyGoYhDxwEnAq8lZw/B7gsGT4ipp2B+5PZx+oBD29gzgEJdgT+OxmttT7wgLs/GTfSTwYB/0hOoywA/l/kPD9Jzk8fAZwVOwuAu79mZhOAGYTTJ2+QO3fAPmJm2wMrgPPc/YtYQczsQaA70NzMyoHhwI3Aw2b2e8LG88Ssf67uyBURKR4FcXpHREQyo9IXESkiKn0RkSKi0hcRKSIqfRGRIqLSFxEpIip9EZEiotIXESki/wcpQDTs5E3pkgAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x1f462e48>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "\n",
    "# Plot the score of every candidate K from the most recent search cell.\n",
    "fig, ax = plt.subplots()\n",
    "ax.plot(Ks, np.array(CH_scores), 'b-')\n",
    "ax.set_xticks(Ks)\n",
    "# Label the figure so it can be read on its own. CH_scores holds the values returned\n",
    "# by K_cluster_analysis, which are silhouette coefficients. The trailing ';' hides the repr.\n",
    "ax.set(xlabel='K (number of clusters)', ylabel='silhouette score', title='Silhouette score vs. K');"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 可看出分成2类时的效果最好(每次聚类的结果会有些许浮动，不过重复N次试验后，K=2的结果仍然为最佳)\n",
    "#### 之前2.2中PCA降维时保留了20个特征。 特征数不一样的话，最后最佳的K值也会变化"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
